Daily stock prices for the last five years for each stock (if available; some of the companies don’t have that long a history on the stock market) will be extracted from Yahoo! Finance automatically. Stocks that don’t have three years of history will be excluded. In case any of the stocks are missing, the stock data will not be extracted from other sources, so as not to complicate the matter. After the automatic download, the data will be stored in .csv files, one for each stock, from which the information can later be extracted; the files also serve as an extra backup.
The reason for this is that Yahoo! Finance has turned out to be less reliable — with less consistent data, problems in downloading it, and the closing of its API service — after the acquisition by Verizon in 2016. Other sources such as Google, Quandl, Alpha Vantage, Nordnet, Avanza, and some other Swedish alternatives have been considered, but they turned out to be even worse.
The data will include information such as date, open and closing price for each particular day, high and low, volume, as well as adjusted closing price. The latter is adjusted so that it takes into account changes in the stock price due to splits and dividends, while the rest only is adjusted to splits. It is crucial that the input data takes into account splits, because a stock’s price can be reduced (or increased) ten times through a split without having an effect on the company’s value. Running an algorithm on such data would probably result in worthless results.
####### For reproducible results
#from numpy.random import seed
#seed(1)
#from tensorflow import set_random_seed
#set_random_seed(2)
#######
import copy
import datetime
import math
import operator
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.ticker import MaxNLocator, IndexFormatter
from pandas_datareader import data
from pandas_datareader._utils import RemoteDataError
from retrying import retry
##### Introduce some useful functions ####
# Yahoo! Finance isn't that reliable and might throw a RemoteDataError if we try to get the data too fast.
# For this reason, add a
#@retry function to minimize the risk of RemoteDataError. 10 retries, with one
# second pause in between each run.
@retry(stop_max_attempt_number=10)
def get_stock_data(ticker, dates):
    """Download daily price data for `ticker` from Yahoo! Finance.

    ticker : Yahoo! Finance ticker symbol (e.g. 'AQ.ST').
    dates : currently unused — the download range comes from the module-level
        `start` and `end` variables defined further down. TODO(review):
        confirm whether `dates` was meant to replace them.

    Returns a dataframe sorted in descending date order (newest first).

    Bug fixes: `RemoteDataError` was never imported (the except clause raised
    a NameError instead of catching), and the exception was swallowed, so the
    @retry decorator never saw a failure and never retried. We now re-raise
    after the pause so @retry performs up to 10 attempts.
    """
    try:
        stock = data.get_data_yahoo(ticker, start, end)
        stock.sort_index(ascending=False, inplace=True)
        return stock
    except RemoteDataError:
        time.sleep(1)
        print('Trying again..')
        raise  # propagate so @retry actually triggers another attempt
def create_file_path(ticker):
    """Return the full path of the .csv file used to store data for `ticker`.

    The base directory is hard-coded; the file name is the ticker symbol with
    a '.csv' extension appended.
    """
    base = '/Users/jakob/Desktop/Programming/Udacity Machine Learning Nano Degree/Capstone Project/Data_Capstone/'
    file_path = base + ticker + '.csv'
    return file_path
def fill_missing_values(data_df):
    """Fill missing values in `data_df` in place: forward-fill first, then
    back-fill whatever gaps remain at the start of the frame.

    Uses DataFrame.ffill()/bfill() instead of the deprecated
    fillna(method=...) form; the result is identical.
    """
    data_df.ffill(inplace=True)
    data_df.bfill(inplace=True)
# Date range used for the downloads: 2012-12-27 up to today.
# NOTE: `start` and `end` are also read as globals by get_stock_data() above.
start = datetime.datetime(2012,12,27)
end = datetime.date.today()
dates = pd.date_range(start, end)
# Human-readable company names; the OMX Stockholm 30 index comes first.
stock_names = ['OMX Stockholm 30', 'Acando B', 'Addnode Group B', 'Addtech B', 'Africa Oil',
'AQ Group', 'Arcam', 'Beijer Alma', 'Beijer Ref', 'BioGaia B', 'Biotage',
'BlackPearl R. Inc.','Bulten', 'Bure Equity','Byggmax','Catella A','Catella B',
'Catena', 'Cavotec SA','CellaVision', 'Clas Ohlson B', 'Cloetta B',
'Concentric', 'Creades A', 'Diös Fastigheter', 'Duni', 'Elanders B',
'EnQuest PLC', 'Fagerhult', 'Fast Partner','G5 Entertainment', 'Gunnebo',
'Haldex', 'Hansa Medical', 'Heba B', 'HiQ International', 'HMS Networks',
'IAR Systems', 'INVISIO Communications', 'Kabe B', 'KappAhl', 'Karo Pharma',
'Knowit','Lindab International','Lucara Diamond Corp.','Medivir B','Mekonomen',
'Midsona A', 'Midsona B', 'Mycronic', 'Nederman Holding', 'Net Insight B',
'New Wave B', 'Nolato B', 'OEM International B', 'Opus Group','Orexo', 'Probi',
'Qliro Group', 'RaySearch Laboratories B', 'Rezidor Hotel Group', 'SAS',
'Semafo', 'SkiStar B', 'Starbreeze B','Swedol B','Systemair','Tethys Oil',
'Traction B', 'VBG Group B','Vitrolife','Xvivo Perfusion','Öresund Investment']
# The corresponding Yahoo! Finance ticker symbols, OMX index first.
# NOTE(review): assumed to align one-to-one with stock_names — confirm the
# two lists have equal length and matching order.
tickers = ['^OMX', 'ACAN-B.ST', 'ANOD-B.ST', 'ADDT-B.ST', 'AOI.ST', 'AQ.ST', 'ARCM.ST',
'BEIA-B.ST', 'BEIJ-B.ST', 'BIOG-B.ST', 'BIOT.ST', 'PXXS-SDB.ST','BULTEN.ST',
'BURE.ST', 'BMAX.ST', 'CAT-A.ST', 'CAT-B.ST', 'CATE.ST', 'CCC.ST', 'CEVI.ST',
'CLAS-B.ST', 'CLA-B.ST', 'COIC.ST', 'CRED-A.ST', 'DIOS.ST', 'DUNI.ST', 'ELAN-B.ST',
'ENQ.ST', 'FAG.ST', 'FPAR.ST', 'G5EN.ST', 'GUNN.ST', 'HLDX.ST', 'HMED.ST',
'HEBA-B.ST', 'HIQ.ST', 'HMS.ST', 'IAR-B.ST', 'IVSO.ST', 'KABE-B.ST', 'KAHL.ST',
'KARO.ST', 'KNOW.ST', 'LIAB.ST', 'LUC.ST', 'MVIR-B.ST','MEKO.ST', 'MSON-A.ST',
'MSON-B.ST', 'MYCR.ST', 'NMAN.ST', 'NETI-B.ST', 'NEWA-B.ST','NOLA-B.ST','OEM-B.ST',
'OPUS.ST', 'ORX.ST', 'PROB.ST', 'QLRO.ST', 'RAY-B.ST', 'REZT.ST', 'SAS.ST',
'SMF.ST', 'SKIS-B.ST', 'STAR-B.ST', 'SWOL-B.ST', 'SYSR.ST', 'TETY.ST', 'TRAC-B.ST',
'VBG-B.ST', 'VITR.ST', 'XVIVO.ST', 'ORES.ST']
Many of the stocks are missing data values. To fix this, pandas' built-in 'fill forward' and 'fill backward' methods are used; they are applied in the function fill_missing_values() defined above. This is a common approach when dealing with incomplete time series data.
### Download and save the data into .csv files ###
#time1 = time.time()
#for i in tickers:
# stock_df = get_stock_data(i, dates)
# # Fill missing values forward, then, fill backward
# fill_missing_values(stock_df)
# # Save the files as .csv as well
# stock_df.to_csv(create_file_path(i))
#print("Total time to download the data: {0:0.0f} s".format(time.time() - time1))
#display(stock_df.head())
What's happening below? First, all the .csv files are loaded into dataframes and stored in a list called file_names. Each dataframe contains all the data for one stock. Consequently, each position in the list contains the stock data for one stock. Further on, the stock name is added as a new column in each stock dataframe.
Set the index column to the Date column when using .read_csv. This will later facilitate when normalizing the data.
import os
from glob import glob
# Directory containing the previously downloaded .csv input files.
directory = '/Users/jakob/Desktop/Programming/Udacity Machine Learning Nano Degree/Capstone Project/Data_Capstone/'
file_paths = glob(directory+"*.csv") # Get each .csv file in the directory
# Derive the ticker names directly from file_paths so that file_names[i]
# always corresponds to file_paths[i].
# (The previous os.walk() approach relied on two independent directory
# traversals returning files in the same order, and on deleting a stray
# hidden file at index 0 — both fragile assumptions.)
file_names = [os.path.basename(p)[:-4] for p in file_paths]  # strip the '.csv' extension
# Define a common index for all dataframes (taken from the first file).
m = pd.read_csv(file_paths[0], index_col='Date')
glob_index = m.index
# Get the input data from the .csv files
loaded_stocks = []
for i in range(len(file_paths)):
    # One dataframe per stock; the Date column becomes the index, which
    # simplifies the normalization step later on.
    stock = pd.read_csv(file_paths[i], index_col='Date')
    stock['Volatility'] = (stock['High'] - stock['Low']) / stock['Open'] # Calculate the volatility
    # Encode the ticker in the index name as '<ticker>__Date' so that helper
    # functions (get_ticker / get_stock) can recover it later.
    # NOTE(review): assumes file_names[i] corresponds to file_paths[i] — they
    # were collected by different directory traversals; confirm the ordering.
    stock.index.names = [file_names[i] + '__' + 'Date'] # Change the index name to stock name + Date
    loaded_stocks.append(stock)
# Quick sanity check of the amount of data loaded.
dim = loaded_stocks[1].shape
print("Total amount of input data points: {0}".format(dim[0] * dim[1] * len(loaded_stocks)))
print("Number of stocks: ", len(loaded_stocks))
#print(this_stock.to_string()) # print the entire dataframe
display(loaded_stocks[1].head())
#print(loaded_stocks[1].index.name[:-6])
Normalize the stock data according to the first date (2012-12-27) in each stock dataframe.
Display the top and bottom five values, both unchanged and normalized.
def normalize_data(prices):
    """Normalize `prices` by its last row.

    The frames are stored in descending date order, so the last row holds the
    first trading date — every value becomes a multiple of that day's value.
    Works for both dataframes and plain array-likes.
    """
    if isinstance(prices, pd.DataFrame):
        baseline = prices.iloc[-1]
    else:  # plain array
        baseline = prices[-1]
    return prices / baseline
# Normalize every stock frame and clean it up afterwards.
norm_stock_prices = []
for stock_frame in loaded_stocks:
    normalized = normalize_data(stock_frame)
    # A zero value on the first date produces inf after division — zero it out.
    normalized[normalized == np.inf] = 0
    # Fill remaining gaps forward, then backward.
    fill_missing_values(normalized)
    # Drop duplicated index entries (if any), keeping the last occurrence.
    normalized = normalized[~normalized.index.duplicated(keep='last')]
    norm_stock_prices.append(normalized)
# Sanity check: show the head/tail and row count of one normalized stock.
display(norm_stock_prices[1].head(), norm_stock_prices[1].iloc[-5:, :])
print(len(norm_stock_prices[1]))
# Define some more useful functions.
def get_stock(stock_list ,ticker):
    """Return the dataframe in `stock_list` whose index name encodes `ticker`.

    Each frame's index is named '<ticker>__Date'; the trailing six characters
    ('__Date') are stripped before comparison. Returns None when no frame
    matches.
    """
    return next((frame for frame in stock_list if frame.index.name[:-6] == ticker), None)
def get_ticker(DataFrame):
    """Return the ticker symbol encoded in the frame's index name.

    The index name has the form '<ticker>__Date'; stripping the final six
    characters leaves the ticker.
    """
    index_name = DataFrame.index.name
    return index_name[:-6]
# Just check so that it works as intended: fetch one stock by ticker and
# display the first rows.
starbreeze_df = get_stock(loaded_stocks ,'STAR-B.ST')
display(starbreeze_df.head())
Define functions for plotting the dataframes. There might exist better and easier solutions, but in order to get acceptable plots, quite some adjustments are needed.
from mpl_toolkits.axes_grid1.inset_locator import zoomed_inset_axes
from mpl_toolkits.axes_grid1.inset_locator import mark_inset
# Define a function for plotting a dataframe
def plot_it(data_df, title='', xlabel='', ylabel='', legend=''):
    """Plot the stock data stored in `data_df`.

    data_df : frame/series with rows in descending date order; it is reversed
        before plotting so time runs left to right.
    title, xlabel, ylabel : plot title and axis labels.
    legend : unused — kept so existing callers keep working.
    """
    # Fix: DataFrame.plot(figsize=...) opens its own figure, so the previous
    # leading plt.figure(figsize=(10, 5)) call only created an extra empty
    # figure that was never drawn on; it has been removed.
    pl = data_df.loc[::-1].plot(fontsize=12, figsize=(13, 5))
    pl.set_title(label=title, fontsize=20)
    pl.set_xlabel(xlabel, fontsize=15)
    plt.autoscale(enable=True, axis='x', tight=True)
    pl.set_ylabel(ylabel, fontsize=15)
    plt.legend(fontsize=12, loc='upper left') # [legend],
    plt.grid(axis='both', alpha=.5)
    pl.xaxis.set_major_locator(MaxNLocator(12))
    # Re-label the integer x positions with the (reversed) date index.
    # NOTE(review): IndexFormatter was removed in matplotlib 3.5 — confirm the
    # installed version still provides it.
    pl.xaxis.set_major_formatter(IndexFormatter(data_df.index[::-1]))
    plt.xticks(rotation=50, horizontalalignment='center', rotation_mode='default')
    plt.show()
# Define a function for plotting two dataframes in different colours in the same plot
def plot_2it(data_df1, data_df2, label1='', label2='', title=''):
    """Plot the stock data stored in data_df1 and data_df2 in different colors.
    label1 and label2 are the label names while title is the plot title.
    Both inputs are expected to be Series in descending date order
    (e.g. a train/test split of one column) — TODO confirm."""
    plt.figure(figsize=(10, 5))
    # Reverse to chronological order before plotting (rows are newest-first).
    pl = data_df1.loc[::-1].plot(fontsize=12, figsize=(13, 5), label=label1, color='green')
    # Pad with None for the length of df1 so df2 is drawn to the right of it
    # on the shared x-axis.
    plt.plot([None for i in data_df1.loc[::-1]] + [x for x in data_df2.loc[::-1]], label=label2, color='royalblue')
    pl.set_title(label=title, fontsize=20)
    pl.set_xlabel('Date', fontsize=15)
    plt.autoscale(enable=True, axis='x', tight=True)
    pl.set_ylabel('Price', fontsize=15)
    plt.legend(fontsize=12, loc='upper left')
    plt.grid(axis='both', alpha=.5)
    pl.xaxis.set_major_locator(MaxNLocator(12))
    # Combined index over both frames, used for the x tick labels.
    temp = pd.concat([data_df2, data_df1], axis=1)
    # NOTE(review): IndexFormatter was removed in matplotlib 3.5 — confirm the
    # installed version still provides it.
    pl.xaxis.set_major_formatter(IndexFormatter(temp.index))
    plt.xticks(rotation=50, horizontalalignment='center', rotation_mode='default')
    plt.show()
# Define a function for plotting three dataframes in different colours in the same plot
def plot_3it(data_df1, data_df2, data_df3, label1='', label2='', label3='', title=''):
    """Plot the stock data stored in data_df1 and data_df2 in different colors.
    The entire dataset is stored in data_df3.
    label1, label2 and label3 are the label names while title is the plot title"""
    plt.figure(figsize=(10, 5))
    # Reverse to chronological order before plotting (rows are newest-first).
    pl = data_df1.loc[::-1].plot(fontsize=12, figsize=(13, 5), label=label1, color='green')
    # Pad with None for the length of df1 so df2 is drawn to the right of it.
    plt.plot([None for i in data_df1.loc[::-1]] + [x for x in data_df2.loc[::-1]], label=label2, color='royalblue')
    # The full dataset is drawn on top for reference.
    plt.plot(data_df3.loc[::-1], label=label3, color='darkorange')
    pl.set_title(label=title, fontsize=20)
    pl.set_xlabel('Date', fontsize=15)
    plt.autoscale(enable=True, axis='x', tight=True)
    pl.set_ylabel('Price', fontsize=15)
    plt.legend(fontsize=12, loc='upper left')
    plt.grid(axis='both', alpha=.5)
    pl.xaxis.set_major_locator(MaxNLocator(12))
    # Combined index over the first two frames, used for the x tick labels.
    temp = pd.concat([data_df2, data_df1], axis=1)
    pl.xaxis.set_major_formatter(IndexFormatter(temp.index))
    plt.xticks(rotation=50, horizontalalignment='center', rotation_mode='default')
    plt.show()
# Plot the stocks and compare every single one with the OMX Stockholm 30 index
count = 0
for stock in norm_stock_prices[1:]:
    # Pair each stock's Adj Close with the OMX index's Adj Close in one frame.
    temp_df = pd.concat([stock.loc[:, 'Adj Close'], norm_stock_prices[0].loc[:, 'Adj Close']],
                        keys=[get_ticker(stock), get_ticker(norm_stock_prices[0])], axis=1)
    fill_missing_values(temp_df)
    concat_df = temp_df.set_index(glob_index) # Set index equal to OMX's index
    # NOTE(review): indentation below was reconstructed from a flattened
    # notebook — the reversal lines are assumed to belong to the if-branch.
    if concat_df.iloc[-1,0] != 1.0: # If the dataframe is inverted, correct it.
        concat_df = concat_df[::-1]
        #concat_df = concat_df.reverse() # reverse() can also be used
        concat_df.index = concat_df.index[::-1]
    #if count <= 10:
    #    plot_it(concat_df, xlabel='Date', ylabel='Price')
    count += 1
They all seem to be correct. So, we have downloaded, imported, normalized, filled empty values and plotted data for 73 different stocks, OMX Stockholm 30 included, successfully.
MinMaxScale, do a train-test split and plot the resulting plot for the first 10 stocks. Different colors for train and test set.
# Create a copy to keep scaled and normalized data apart. [Have to use copy.deepcopy()]
scaled_LOG_stock_prices = copy.deepcopy(norm_stock_prices)
from sklearn.preprocessing import MinMaxScaler
##### To be able to use fit_transform, the input data must be at most 2-D.
# We have to specify one scaler for each column. There are different min and max values in each column and
# the MinMaxScaler will therefore be tuned slightly differently for each one of them.
# It is needed to get the correct output later on.
# Create a MinMaxScaler for each column in each stock and store it in the dictionary many_MinMaxScalers.
# The key of each scaler is the stock ticker + column number.
# [0, 1, 2, 3, 4, 5, 6] <=> ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'Volatility']
# Specify one scaler for each column and stock
many_MinMaxScalers = {}
for i_s in range(len(norm_stock_prices)):
    for j_s in range(7):
        many_MinMaxScalers["{0}".format(get_ticker(norm_stock_prices[i_s])+str(j_s))] = MinMaxScaler(feature_range=(0,1))
print(len(many_MinMaxScalers))
def MMscale_data(data):
    """MinMax-scale every column of `data` in place.

    Each column is transformed by its dedicated scaler, looked up in the
    module-level many_MinMaxScalers dict under '<ticker><column number>'.
    Returns the (mutated) dataframe for convenience.
    """
    for col_nbr, _ in enumerate(data.columns):
        scaler = many_MinMaxScalers[get_ticker(data) + str(col_nbr)]
        data.iloc[:, col_nbr] = scaler.fit_transform(data.iloc[:, col_nbr].values.reshape(-1, 1))
    return data
def Un_scale_data(data, ticker=' '):
    """Invert the 'Adj Close' (column 4) MinMax scaling of `data`.

    data : dataframe (modified in place) or plain array (a new array is
        returned).
    ticker : only needed when `data` is an array; dataframes carry the ticker
        in their index name.

    Bug fix: the original condition (`ticker == ' ' or isinstance(...)`) sent
    an array passed without a ticker into the dataframe branch, where
    get_ticker() fails on an ndarray. We now branch purely on the data type.
    """
    if isinstance(data, pd.DataFrame):
        data.iloc[:,] = many_MinMaxScalers[get_ticker(data)+str(4)].inverse_transform(data.iloc[:,].values.reshape(-1,1))
    else: # is an array
        data = many_MinMaxScalers[ticker+str(4)].inverse_transform(data.reshape(-1,1))
    return data
def Un_scale_data_whole(data):
    """Invert the MinMax scaling of every column of `data`, in place.

    Uses the per-column scalers stored in the module-level
    many_MinMaxScalers dict. Returns the (mutated) dataframe.
    """
    for column_nbr in range(len(data.columns)):
        key = get_ticker(data) + str(column_nbr)
        unscaled = many_MinMaxScalers[key].inverse_transform(data.iloc[:, column_nbr].values.reshape(-1, 1))
        data.iloc[:, column_nbr] = unscaled
    return data
# Lists holding the normalized train/test splits ...
LOG_norm_train_list, LOG_norm_test_list = [], []
# ... and their scaled-and-normalized counterparts.
LOG_scaled_train_list, LOG_scaled_test_list = [], []
# Plot the first 10 stocks, OMX 30 excluded
for i in range(1, len(scaled_LOG_stock_prices)):
    test_size = int(len(scaled_LOG_stock_prices[i]) * 0.20) # Specify the test size
    MMscale_data(scaled_LOG_stock_prices[i])
    # Scaled and normalized. Rows are newest-first, so the first `test_size`
    # rows (the most recent dates) form the test set.
    LOG_train, LOG_test = scaled_LOG_stock_prices[i][test_size:], scaled_LOG_stock_prices[i][0:test_size]
    # save each one into a list
    LOG_scaled_train_list.append(LOG_train)
    LOG_scaled_test_list.append(LOG_test)
    # Normalized
    LOG_norm_train, LOG_norm_test = norm_stock_prices[i][test_size:], norm_stock_prices[i][0:test_size]
    LOG_norm_train_list.append(LOG_norm_train)
    LOG_norm_test_list.append(LOG_norm_test)
    #if i <= 10:
    #    plot_2it(LOG_norm_train.loc[:, 'Adj Close'], LOG_norm_test.loc[:, 'Adj Close'],
    #             'Training set', 'Test set', get_ticker(norm_stock_prices[i]))
# The counts below refer to the last stock processed in the loop.
print("Training samples: {0}".format(len(LOG_train)))
print("Testing samples: {0}".format(len(LOG_test)))
#for k in range(len(LOG_norm_train_list)):
#    plot_3it(LOG_norm_train_list[k].loc[:, 'Adj Close'], LOG_norm_test_list[k].loc[:, 'Adj Close'],
#             norm_stock_prices[0].loc[:, 'Adj Close'], 'Training set', 'Test set', 'OMX30',
#             title = get_ticker(LOG_norm_train_list[k]))
# Sanity check: the test set should end where the train set begins.
display(LOG_norm_test_list[0].iloc[-3:, :], LOG_norm_train_list[0].iloc[:3, :])
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, LSTM, LeakyReLU
from keras import optimizers
from keras import regularizers
# Print numpy arrays in full, up to 1000 elements, before truncating.
np.set_printoptions(threshold=1000)
""" Define and compile a simple logistic regression model """
def logistic_regression_model(output_size, neurons, activ_func='relu',
optimizer='adam', loss='mean_squared_error'):
model = Sequential()
model.add(Dense(output_size, activation=activ_func, input_shape=(7,)))
#model.add(Dropout(dropout)) ### Bättre resultat utan Dropout (innan)
model.compile(optimizer = optimizer, loss = loss) #, metrics=['accuracy'])
model.summary()
return model
Our training input data consist of 1037 rows and 7 columns, while the training output consists of 1037 rows and one column ('Adjusted Close'). Likewise, the testing data consist of 258 rows and seven columns in the input and one column in the output. The reason to choose only one output column is that we are really only interested in predicting the adjusted closing price of the stock.
# Create the datasets. Training and testing inputs as well as outputs.
# The frames are stored newest-first, so [::-1] flips them into chronological
# order; inputs drop the last day and outputs are shifted one day ahead.
LOG_train_inputs = copy.deepcopy(LOG_scaled_train_list[0][::-1][:-1]) # Remove the last day
LOG_train_outputs = copy.deepcopy(LOG_scaled_train_list[0][::-1].iloc[1:, :]) # Move 1 day ahead and choose
# Adjusted Close as only output
# NOTE(review): the outputs actually keep all 7 columns (iloc[1:, :]) even
# though the comment mentions only Adjusted Close — this matches
# output_size=7 below; confirm intent.
LOG_test_inputs = copy.deepcopy(LOG_scaled_test_list[0][::-1][:-1])
LOG_test_outputs = copy.deepcopy(LOG_scaled_test_list[0][::-1].iloc[1:, :])
print(LOG_train_inputs.shape)
print(LOG_train_outputs.shape)
print()
print(LOG_test_inputs.shape)
print(LOG_test_outputs.shape)
# Random seed for reproducibility
np.random.seed(45)
# Build the model architecture
LOG_model = logistic_regression_model(output_size=7, neurons=30)
# Train the model
trained_model = LOG_model.fit(LOG_train_inputs, LOG_train_outputs, epochs=20, batch_size=1,
                              verbose=2, shuffle=True, validation_split=0.05)
We would expect the training error to decrease over time.
def plot_error(model):
    """Plot the training-loss curve of a fitted Keras History object.

    model : the History returned by model.fit(); its `epoch` list and
        `history['loss']` values are plotted against each other.
    """
    fig, ax1 = plt.subplots(1,1, figsize=(10, 5))
    loss_values = model.history['loss']
    ax1.plot(model.epoch, loss_values)
    ax1.set_title('Training Error')
    ax1.set_ylabel('Loss',fontsize=12)
    ax1.set_xlabel('# Epochs',fontsize=12)
    plt.show()
# Plot the error
plot_error(trained_model)
# Evaluate MSE on both sets (verbose=0 silences the progress bar).
trainScore = LOG_model.evaluate(LOG_train_inputs, LOG_train_outputs, verbose=0)
testScore = LOG_model.evaluate(LOG_test_inputs, LOG_test_outputs, verbose=0)
print("Mean Squared Error on the training data: {0:0.5f}".format(trainScore))
print("Mean Squared Error on the test data: {0:0.5f}".format(testScore))
Now, check how our model performs on the training and test sets by plotting the real and predicted values and compare them. A zoomed in plot is also used. The idea is taken from this source: http://akuederle.com/matplotlib-zoomed-up-inset
from mpl_toolkits.axes_grid1.inset_locator import zoomed_inset_axes
from mpl_toolkits.axes_grid1.inset_locator import mark_inset
# A function for plotting a dataframe with a zoomed in plot
def plot_zoom(data_df, title='', xlabel='', ylabel=''):
    """Plot the stock stored in data_df with a zoomed-in inset window.
    title is the plot title. data_df is expected to have rows in descending
    date order and to contain a 'True Values' column, which is used for the
    inset's y-limits — TODO confirm at the call sites."""
    plt.figure(figsize=(10, 5))
    pl = data_df.loc[::-1].plot(fontsize=12, figsize=(13, 5)) # legend=None,
    pl.set_title(label=title, fontsize=20)
    pl.set_xlabel(xlabel, fontsize=15)
    plt.autoscale(enable=True, axis='x', tight=True)
    pl.set_ylabel(ylabel, fontsize=15)
    plt.legend(fontsize=12, loc='upper left')
    plt.grid(axis='both', alpha=.5)
    pl.xaxis.set_major_locator(MaxNLocator(12))
    # NOTE(review): IndexFormatter was removed in matplotlib 3.5 — confirm the
    # installed version still provides it.
    pl.xaxis.set_major_formatter(IndexFormatter(data_df.index[::-1]))
    plt.xticks(rotation=50, horizontalalignment='center', rotation_mode='default')
    # The zoomed in window covers the most recent ~10% of the rows.
    lg = int(len(data_df)*0.1)
    axins = zoomed_inset_axes(pl, 2.5, loc=9)
    axins.plot(data_df.loc[::-1])
    x1, x2 = data_df.index[lg,], data_df.index[0] # specify the limits
    y1 = data_df.loc[data_df.index[0]:data_df.index[lg],'True Values'].min()
    y2 = data_df.loc[data_df.index[0]:data_df.index[lg],'True Values'].max()
    axins.set_xlim(x1, x2), axins.set_ylim(y1, y2) # apply the x-limits, apply the y-limits
    axins.set_facecolor('whitesmoke')
    axins.axis[:].set_visible(False) # Remove the 4 borders
    mark_inset(pl, axins, loc1=2, loc2=4, fc="none", ec="1.5") # Add some lines for the zoom effect
    plt.show()
# A function for plotting three dataframes and a zoomed in plot
def plot_3zoom(data_df1, data_df2, data_df3, title='', xlabel='', ylabel='', zoom=True):
    """data_df1 contains the train set, data_df2 contains the test set and data_df3 contains the entire dataset.
    title is the plot title. If a zoomed in window is desired, set zoom to True.
    NOTE(review): the zoom branch reads the module-level `test_size` variable
    and expects an 'Actual Data' column in data_df3 — confirm both hold when
    calling with zoom=True."""
    line_w, line_zoom = 1.0, 1.5 # line width for the main and zoomed plot
    # Plot the predicted train and test data
    diff = len(data_df3)-len(data_df2)-len(data_df1)
    pl = data_df1.plot(color='orchid', fontsize=12, figsize=(16, 7), label=data_df1.columns[0], linewidth=line_w)
    # Build an all-NaN array shaped like the full dataset, then place the test
    # predictions at the end so they line up on the shared x-axis.
    pred = np.empty_like(data_df3)
    pred[:, :] = np.nan
    pred[len(data_df1)+diff:len(data_df3), :] = data_df2
    plt.plot(pred, color='darkorange', label=data_df2.columns[0], linewidth=line_w)
    # Plot the actual values
    plt.plot(data_df3, color='green', label=data_df3.columns[0], linewidth=line_w)
    pl.set_title(label=title, fontsize=20)
    pl.set_xlabel(xlabel, fontsize=15)
    plt.autoscale(enable=True, axis='x', tight=True)
    pl.set_ylabel(ylabel, fontsize=15)
    plt.legend(fontsize=12, loc='upper left')
    plt.grid(axis='both', alpha=.5)
    pl.xaxis.set_major_locator(MaxNLocator(12))
    pl.xaxis.set_major_formatter(IndexFormatter(data_df3.index[::-1]))
    plt.xticks(rotation=50, horizontalalignment='center', rotation_mode='default')
    if zoom:
        ## The zoomed in window
        lg = int(len(data_df3)*0.1)
        axins = zoomed_inset_axes(pl, 2.5, loc=9)
        axins.plot(data_df1.iloc[::-1], color='orchid', linewidth=line_zoom)
        axins.plot(pred, color='darkorange', label=data_df2.columns[0], linewidth=line_zoom)
        axins.plot(data_df3.iloc[::-1], color='green', linewidth=line_zoom)
        x1, x2 = data_df1[::-1].index[-lg//2,], data_df2[::-1].index[lg//2] # specify the limits
        # Check for the max and min y values in the actual values, within the x limits.
        yA1 = data_df3.loc[data_df3.index[test_size-lg//2]:data_df3.index[test_size+lg//2],'Actual Data'].min()
        yA2 = data_df3.loc[data_df3.index[test_size-lg//2]:data_df3.index[test_size+lg//2],'Actual Data'].max()
        # Check for the max and min y values in the train set, within the x limits.
        yTr1 = data_df1.iloc[-lg//2:, 0].min()
        yTr2 = data_df1.iloc[-lg//2:, 0].max()
        # Check for the max and min y values in the test set, within the x limits.
        yTe1 = data_df2.iloc[:lg//2, 0].min()
        yTe2 = data_df2.iloc[:lg//2, 0].max()
        ys = [yA1, yA2, yTr1, yTr2, yTe1, yTe2]
        ymax, ymin = max(ys), min(ys) # find the max and min values among the different y's
        axins.set_xlim(x1, x2), axins.set_ylim(ymin, ymax) # apply the x-limits, apply the y-limits
        axins.set_facecolor('whitesmoke')
        axins.axis[:].set_visible(False) # Remove the 4 borders
        mark_inset(pl, axins, loc1=2, loc2=4, fc="none", ec="1.5") # Add some lines for the zoom effect
    plt.show()
# Build dataframes with the model's predictions on the train and test sets,
# plus the actual values, and plot them together.
# NOTE(review): predictions are computed on the *normalized but unscaled*
# frames (LOG_norm_*_list) although the model was trained on the
# MinMax-scaled data — confirm this is intentional.
df1 = pd.DataFrame(data=(np.transpose(LOG_model.predict(LOG_norm_train_list[0][:-1].values)))[0],
                   index=LOG_train_inputs.index, columns=['Predictions on the Train set'])
df2 = pd.DataFrame(data=(np.transpose(LOG_model.predict(LOG_norm_test_list[0][:-1].values)))[0],
                   index=LOG_test_inputs.index, columns=['Predictions on the Test set'])
df3 = pd.DataFrame(data=norm_stock_prices[1].loc[:, 'Adj Close'])
df3.columns = ['Actual Data']
name = get_ticker(LOG_norm_train_list[0])
plot_3zoom(df1[::-1], df2[::-1], df3, title='Logistic Regression Performance on the Training and Test Sets, ' + name,
           xlabel='Date', ylabel='Price', zoom=False)
In the above figure, the actual values are plotted in green, while the orchid (purple) line represents the predicted values for the training set and the orange line the predicted values for the test set. The model seems to perform fairly poorly on both sets, predicting nothing more than the previous day's value.
def plot_some_LOG_models():
    """Train, evaluate and plot a regression model for every remaining stock.

    Iterates over the module-level LOG_scaled_*/LOG_norm_* lists (skipping
    index 0, which was handled above), prints progress and MSE scores, and
    plots predictions against the actual values for each stock.
    """
    # LOG_train_list and LOG_test_list contains the scaled stock values.
    global_time = time.time()
    nbr = 1
    for i in range(1, len(LOG_scaled_train_list)):
        print('===================')
        print('Plot: {0} (out of {1})'.format(nbr, len(LOG_scaled_train_list)-1))
        print('===================')
        # Inputs drop the last day; outputs are shifted one day ahead.
        LOG_train_inputs = copy.deepcopy(LOG_scaled_train_list[i][::-1][:-1])
        LOG_train_outputs = copy.deepcopy(LOG_scaled_train_list[i][::-1].iloc[1:, :])
        LOG_test_inputs = copy.deepcopy(LOG_scaled_test_list[i][::-1][:-1])
        LOG_test_outputs = copy.deepcopy(LOG_scaled_test_list[i][::-1].iloc[1:, :])
        # Random seed for reproducibility
        np.random.seed(45)
        # Build the model architecture
        LOG_model = logistic_regression_model(output_size=7, neurons=30)
        # Train the model
        trained_model = LOG_model.fit(LOG_train_inputs, LOG_train_outputs, epochs=20,
                                      batch_size=1, verbose=2, shuffle=True, validation_split=0.05)
        trainScore = LOG_model.evaluate(LOG_train_inputs, LOG_train_outputs, verbose=0)
        testScore = LOG_model.evaluate(LOG_test_inputs, LOG_test_outputs, verbose=0)
        print("Mean Squared Error on the training data: {0:0.5f}".format(trainScore))
        print("Mean Squared Error on the test data: {0:0.5f}".format(testScore))
        # NOTE(review): predictions use the unscaled LOG_norm_* frames although
        # the model was trained on scaled data — confirm intent.
        df1 = pd.DataFrame(data=(np.transpose(LOG_model.predict(LOG_norm_train_list[i][:-1].values)))[0],
                           index=LOG_train_inputs.index, columns=['Predictions on the Train set'])
        df2 = pd.DataFrame(data=(np.transpose(LOG_model.predict(LOG_norm_test_list[i][:-1].values)))[0],
                           index=LOG_test_inputs.index, columns=['Predictions on the Test set'])
        df3 = pd.DataFrame(data=norm_stock_prices[i+1].loc[:, 'Adj Close'])
        df3.columns = ['Actual Data']
        name = get_ticker(LOG_norm_train_list[i])
        plot_3zoom(df1[::-1], df2[::-1], df3,
                   title='Logistic Regression Performance on the Training and Test Sets, ' + name,
                   xlabel='Date', ylabel='Price', zoom=False)
        nbr += 1
    print('======================================================================================================')
    print('======================================================================================================')
    print('Total run time in seconds: {0:0.0f}'.format(time.time()-global_time))
plot_some_LOG_models()
# Define the LSTM model
def LSTM_model(inputs, output_size, neurons, activ_func="linear",
               dropout=0.5, loss="mean_squared_error", optimizer="adam"):
    """Build and compile a single-layer LSTM model.

    inputs : 3-D array shaped [samples, time steps, features]; only its shape
        is used, to size the LSTM's input layer.
    output_size : number of units in the final Dense layer.
    neurons : number of LSTM units.
    activ_func, dropout, loss, optimizer : usual Keras hyperparameters.
    Returns the compiled model (also prints its summary).
    """
    model = Sequential()
    model.add(LSTM(neurons, input_shape=(inputs.shape[1], inputs.shape[2])))
    #model.add(Activation('tanh'))
    model.add(Dropout(dropout))
    model.add(Dense(units=output_size))
    #model.add(LeakyReLU())
    model.add(Activation(activ_func))
    model.compile(loss=loss, optimizer=optimizer)
    model.summary()
    return model
# worse with sigmoid activation function
# tanh resulted in overfitting (better result on the training data and worse on the test data)
# roughly same training time as before
# Create a copy to keep scaled and normalized data apart. [Have to use copy.deepcopy()]
scaled_norm_stock_prices = copy.deepcopy(norm_stock_prices)
LSTM_train_list, LSTM_test_list = [], [] # Create lists to store the train and test dataframes
norm_train_list, norm_test_list = [], []
# Create train and test sets for the stocks
for stock_price in range(1, len(norm_stock_prices)):
    test_size = int(len(scaled_norm_stock_prices[stock_price]) * 0.20)
    # Normalized. Rows are newest-first, so the first `test_size` rows (the
    # most recent dates) form the test set.
    norm_train = norm_stock_prices[stock_price][test_size:]
    norm_test = norm_stock_prices[stock_price][0:test_size]
    norm_train_list.append(norm_train)
    norm_test_list.append(norm_test)
    # Normalized and scaled data
    MMscale_data(scaled_norm_stock_prices[stock_price])
    LSTM_train = scaled_norm_stock_prices[stock_price][test_size:]
    LSTM_test = scaled_norm_stock_prices[stock_price][0:test_size]
    # save each one into a list
    LSTM_train_list.append(LSTM_train)
    LSTM_test_list.append(LSTM_test)
# The counts below refer to the last stock processed in the loop.
print("Training samples: {0}".format(len(LSTM_train)))
print("Test samples: {0}".format(len(LSTM_test)))
print(LSTM_train.shape)
# convert an array of values into a dataset matrix.
# window is the number of previous time steps to use as input variables to predict the next time period
def create_LSTM_dataset(dataset, window=10, target_col=4):
    """Build (X, y) pairs for LSTM training from a 2-D array.

    dataset : 2-D array; rows are time steps, columns are features.
    window : number of consecutive rows per input sample.
    target_col : index of the column predicted one step after each window.
        Defaults to 4 (the 'Adj Close' column), preserving the old behavior;
        made a parameter so the function generalizes to other targets.

    Returns (X, y) where X has shape (len(dataset)-window, window, n_features)
    and y has shape (len(dataset)-window,).
    """
    dataX = [dataset[i:(i+window), :] for i in range(len(dataset)-window)]
    dataY = [dataset[j + window, target_col] for j in range(len(dataset)-window)]
    return np.array(dataX), np.array(dataY)
# create_LSTM_dataset and create_LSTM_dataset2 produce the exact same results.
def create_LSTM_dataset2(dataset, window=10):
    """Build (X, y) pairs for LSTM training from a 2-D array.

    dataset : 2-D array; rows are time steps, columns are features.
    window : number of consecutive rows per input sample; the target for each
        window is column 4 ('Adj Close') of the row immediately after it.
    """
    samples, targets = [], []
    last_start = len(dataset) - window
    for start_idx in range(last_start):
        end_idx = start_idx + window
        samples.append(dataset[start_idx:end_idx, :])
        targets.append(dataset[end_idx, 4])
    return np.array(samples), np.array(targets)
# (LSTM's apparently work best with time steps in the size of 200-400 steps. I'll opt for 200.)
# Check if the scaling went as expected by comparing a scaled frame with its
# merely normalized counterpart.
display(scaled_norm_stock_prices[10].head())
display(norm_stock_prices[10].head())
Specify how many days our model will base its predictions on by changing the window parameter.
# The predictions are far more reliable when using the scaled input data rather than the unscaled for the LSTM model.
# (train_scaled and test_scaled are far better than train and test). The difference in loss after 5 epochs is
# 1/250th in favour for the scaled values.
# Number of previous days each prediction is based on.
window=1
# get ticker
tick = get_ticker(LSTM_train_list[0])
"""Create the datasets"""
LSTM_train_input, LSTM_train_output = create_LSTM_dataset(LSTM_train_list[0].values, window)
LSTM_test_input, LSTM_test_output = create_LSTM_dataset(LSTM_test_list[0].values, window)
'''reshape the input to be [samples, time steps, features]'''
LSTM_test_input = np.reshape(LSTM_test_input, (LSTM_test_input.shape[0], LSTM_test_input.shape[1], 7))
LSTM_train_input = np.reshape(LSTM_train_input, (LSTM_train_input.shape[0], LSTM_train_input.shape[1], 7))
# Sanity-check the resulting shapes.
print(LSTM_train_input.shape)
print(LSTM_train_output.shape)
print('-----------')
print(LSTM_test_input.shape)
print(LSTM_test_output.shape)
# Check whether two arrays are equal
#print(np.array_equal(LSTM_train_input, testx))
#print(np.array_equal(LSTM_train_output, testy))
# Random seed for reproducibility
np.random.seed(2)
# Create the model
model = LSTM_model(LSTM_train_input, output_size = 1, neurons=20)
# Train the LSTM on the scaled inputs.
trained_LSTM = model.fit(LSTM_train_input, LSTM_train_output, epochs=20,
                         batch_size=1, verbose=1, shuffle=True, validation_split=0.05)
# Plot the loss curve and evaluate MSE on both sets.
plot_error(trained_LSTM)
trainScore = model.evaluate(LSTM_train_input, LSTM_train_output, verbose=0)
testScore = model.evaluate(LSTM_test_input, LSTM_test_output, verbose=0)
print("Mean Squared Error on the training data: {0:0.6f}".format(trainScore))
print("Mean Squared Error on the test data: {0:0.6f}".format(testScore))
# A function for plotting three dataframes and a zoomed in plot
def plot_LSTM(data_df1, data_df2, data_df3, window_length, title='', xlabel='', ylabel='', zoom=True):
    """data_df1 contains the train set, data_df2 contains the test set and data_df3 contains the entire dataset.
    title is the plot title. If a zoomed in window is desired, set zoom to True.

    NOTE(review): relies on names not defined in this chunk — zoomed_inset_axes
    and mark_inset (mpl_toolkits.axes_grid1.inset_locator) and the module-level
    test_size — confirm they are imported/defined earlier in the file.
    """
    line_w, line_zoom = 1.0, 1.5
    plt.figure(figsize=(10, 5))
    # Plot the predicted train and test data
    pl = data_df1.plot(color='orchid', fontsize=12, figsize=(16, 7), label=data_df1.columns[0], linewidth=line_w)
    # Offset of the test predictions inside the full dataset.
    diff = len(data_df3)-len(data_df2)-len(data_df1)
    # Build an all-NaN array the length of the whole dataset and paste the
    # test predictions at their correct position so they align on the x-axis.
    pred = np.empty_like(data_df3)
    pred[:, :] = np.nan
    pred[len(data_df1)+diff:len(data_df3), :] = data_df2
    ###pred[len(data_df1)+window_length+1:len(data_df3), :] = data_df2
    plt.plot(pred, color='darkorange', label=data_df2.columns[0], linewidth=line_w)
    # Plot the actual values
    plt.plot(data_df3, color='green', label=data_df3.columns[0], linewidth=line_w)
    pl.set_title(label=title, fontsize=20)
    pl.set_xlabel(xlabel, fontsize=15)
    plt.autoscale(enable=True, axis='x', tight=True)
    pl.set_ylabel(ylabel, fontsize=15)
    plt.legend(fontsize=12, loc='upper left')
    plt.grid(axis='both', alpha=.5)
    pl.xaxis.set_major_locator(MaxNLocator(12))
    # NOTE(review): IndexFormatter was removed in matplotlib 3.5 — this code
    # only runs on older matplotlib versions.
    pl.xaxis.set_major_formatter(IndexFormatter(data_df3.index[::-1]))
    plt.xticks(rotation=50, horizontalalignment='center', rotation_mode='default')
    if zoom:
        ## The zoomed in window
        lg = int(len(data_df3)*0.1)
        axins = zoomed_inset_axes(pl, 2, loc=9)
        axins.plot(data_df1, color='orchid', linewidth=line_zoom)
        axins.plot(pred, color='darkorange', label=data_df2.columns[0], linewidth=line_zoom)
        axins.plot(data_df3.loc[::-1], color='green', linewidth=line_zoom)
        # NOTE(review): the trailing comma makes [-lg//2,] a 1-tuple index —
        # verify this returns the intended scalar label on this pandas version.
        x1, x2 = data_df1.index[-lg//2,], data_df2.index[lg] # specify the limits
        # Check for the max and min y values in the actual values, within the x limits.
        yA1 = data_df3.loc[data_df3.index[test_size-lg]:data_df3.index[test_size+lg//2],'Actual Data'].min()
        yA2 = data_df3.loc[data_df3.index[test_size-lg]:data_df3.index[test_size+lg//2],'Actual Data'].max()
        # Check for the max and min y values in the train set, within the x limits.
        yTr1 = data_df1.iloc[-lg//2:, 0].min()
        yTr2 = data_df1.iloc[-lg//2:, 0].max()
        # Check for the max and min y values in the test set, within the x limits.
        yTe1 = data_df2.iloc[:lg, 0].min()
        yTe2 = data_df2.iloc[:lg, 0].max()
        ys = [yA1, yA2, yTr1, yTr2, yTe1, yTe2]
        ymax, ymin = max(ys), min(ys) # find the max and min values among the different y's
        axins.set_xlim(x1, x2), axins.set_ylim(ymin, ymax) # apply the x-limits, apply the y-limits
        #plt.yticks(visible=False), plt.xticks(visible=False) # Remove the tickers
        axins.set_facecolor('whitesmoke')
        axins.axis[:].set_visible(False) # Remove the 4 borders
        mark_inset(pl, axins, loc1=2, loc2=4, fc="none", ec="1.5") # Add some lines for the zoom effect
    plt.show()
# The prediction output always has 1 column (chosen when the LSTM was designed).
# Make predictions for the train set. Then invert the scaling.
LSTM_train_pred = model.predict(copy.deepcopy(LSTM_train_input))
LSTM_train_pred = Un_scale_data(copy.deepcopy(LSTM_train_pred), tick)
LSTM_train_output = Un_scale_data(copy.deepcopy(LSTM_train_output), tick)
# Make predictions for the test set. Then invert the scaling.
LSTM_test_pred = model.predict(copy.deepcopy(LSTM_test_input))
LSTM_test_pred = Un_scale_data(copy.deepcopy(LSTM_test_pred), tick)
LSTM_test_output = Un_scale_data(copy.deepcopy(LSTM_test_output), tick)
# Sanity-check the un-scaled shapes.
print(LSTM_train_pred.shape)
print(LSTM_train_output.shape)
print(LSTM_test_pred.shape)
print(LSTM_test_output.shape)
# Wrap the predictions and the actual series in DataFrames for plotting.
df1 = pd.DataFrame(data=LSTM_train_pred, index=LSTM_train_list[0].index[:-window],
                   columns=['LSTM Predictions on Train set'])
df2 = pd.DataFrame(data=LSTM_test_pred, index=LSTM_test_list[0].index[:-window],
                   columns=['LSTM Predictions on Test Set'])
df3 = pd.DataFrame(data=norm_stock_prices[1].loc[:, 'Adj Close'][:-window], index=glob_index[:-window])
df3.columns = ['Actual Data']
name = get_ticker(LSTM_train_list[0])
# [::-1] reverses the frames — the data appears to be stored newest-first
# (see the index reversal inside plot_LSTM); TODO confirm.
plot_LSTM(df1[::-1], df2[::-1], df3, window_length=window,
          title= 'LSTM Single Day Performance on the Training and Test Sets, ' + name,
          xlabel='Date', ylabel='Price', zoom=True)
Our LSTM seems to predict the changes in the stock price fairly well — even very well on the training set. However, that isn't surprising, given that this is the data the model was trained on. More important is how it performs on the unseen test data (orange), where it still gives quite acceptable predictions. There are a few misses, but by and large the results are good.
# def many_LSTM_models(nbr_of_plots=3)
def many_LSTM_models():
    """Repeat the single-stock LSTM pipeline above for every remaining stock.

    For each stock index 1..len(LSTM_train_list)-1: build the windowed
    datasets, train a fresh single-day LSTM, report train/test MSE,
    un-scale the predictions and plot them with plot_LSTM. Produces one
    plot per stock; nothing is returned.
    """
    global_time = time.time()
    nbr = 1
    window=1
    for i in range(1, len(LSTM_train_list)):
        print('===================')
        print('Plot: {0} (out of {1})'.format(nbr, len(LSTM_train_list)-1))
        print('===================')
        LSTM_train_input, LSTM_train_output = create_LSTM_dataset(LSTM_train_list[i].values, window)
        LSTM_test_input, LSTM_test_output = create_LSTM_dataset(LSTM_test_list[i].values, window)
        '''reshape the input to be [samples, time steps, features]'''
        LSTM_test_input = np.reshape(LSTM_test_input, (LSTM_test_input.shape[0], LSTM_test_input.shape[1], 7))
        LSTM_train_input = np.reshape(LSTM_train_input, (LSTM_train_input.shape[0], LSTM_train_input.shape[1], 7))
        # Random seed for reproducibility
        np.random.seed(2)
        model = LSTM_model(LSTM_train_input, output_size = 1, neurons=20)
        trained_LSTM = model.fit(LSTM_train_input, LSTM_train_output, epochs=20,
                                 batch_size=1, verbose=1, shuffle=True, validation_split=0.05)
        trainScore = model.evaluate(LSTM_train_input, LSTM_train_output, verbose=0)
        testScore = model.evaluate(LSTM_test_input, LSTM_test_output, verbose=0)
        print("Mean Squared Error on the training data: {0:0.6f}".format(trainScore))
        print("Mean Squared Error on the test data: {0:0.6f}".format(testScore))
        tick = get_ticker(LSTM_train_list[i])
        # Make predictions for the train set. Then invert the scaling.
        LSTM_train_pred = model.predict(copy.deepcopy(LSTM_train_input))
        LSTM_train_pred = Un_scale_data(copy.deepcopy(LSTM_train_pred), tick)
        LSTM_train_output = Un_scale_data(copy.deepcopy(LSTM_train_output), tick)
        # Make predictions for the test set. Then invert the scaling.
        LSTM_test_pred = model.predict(copy.deepcopy(LSTM_test_input))
        LSTM_test_pred = Un_scale_data(copy.deepcopy(LSTM_test_pred), tick)
        LSTM_test_output = Un_scale_data(copy.deepcopy(LSTM_test_output), tick)
        df1 = pd.DataFrame(data=LSTM_train_pred, index=LSTM_train_list[i].index[:-window],
                           columns=['LSTM Predictions on Train set'])
        df2 = pd.DataFrame(data=LSTM_test_pred, index=LSTM_test_list[i].index[:-window],
                           columns=['LSTM Predictions on Test Set'])
        # NOTE(review): norm_stock_prices[1+i] assumes the price list is
        # offset by one relative to LSTM_train_list — confirm against the
        # data-preparation cells.
        df3 = pd.DataFrame(data=norm_stock_prices[1+i].loc[:, 'Adj Close'][:-window], index=glob_index[:-window])
        df3.columns = ['Actual Data']
        name = get_ticker(LSTM_train_list[i])
        plot_LSTM(df1[::-1], df2[::-1], df3, window_length=window,
                  title= 'LSTM Single Day Performance on the Training and Test Sets, ' + name,
                  xlabel='Date', ylabel='Price', zoom=True)
        nbr += 1
        print('======================================================================================================')
        print('======================================================================================================')
    print('Total run time in seconds: {0:0.0f}'.format(time.time()-global_time))
#many_LSTM_models()
from keras import regularizers
# Define the LSTM model
def LSTM10_model(inputs, output_size, neurons, activ_func="linear",
                 dropout=0.2, loss="mean_squared_error", optimizer="rmsprop"):
    """Build and compile a two-layer LSTM regression model.

    inputs: 3D training array [samples, time steps, features]; only its
    shape is read. The network is LSTM(neurons) -> Dropout ->
    LSTM(2*neurons) -> Dropout -> Dense(output_size) -> Activation.
    Prints a layer summary and returns the compiled model.
    """
    model = Sequential([
        LSTM(neurons, input_shape=(inputs.shape[1], inputs.shape[2]), return_sequences=True),
        Dropout(dropout),
        LSTM(neurons * 2, return_sequences=False),
        Dropout(dropout),
        Dense(units=output_size),
        Activation(activ_func),
    ])
    model.compile(loss=loss, optimizer=optimizer)
    model.summary()
    return model
def create_LSTM10_dataset(dataset, window, pred_len=10, target_col=4):
    """Slice *dataset* into multi-step supervised-learning pairs.

    dataset:    2D array of shape (days, features).
    window:     number of historical rows each input sample contains.
    pred_len:   number of future steps in each target sequence.
    target_col: column index of the predicted feature (default 4,
                the 'Adj Close' column in this file's data layout);
                parameterized so the helper also works for other layouts.

    Returns (X, y) where X has shape (n, window, features) and
    y has shape (n, pred_len), with n = len(dataset)-window-pred_len+1.
    """
    dataX, dataY = [], []
    for i in range(len(dataset) - window - pred_len + 1):
        dataX.append(dataset[i:(i + window), :])
        # The target is the next pred_len values of the chosen column.
        dataY.append(dataset[(i + window):(i + window + pred_len), target_col])
    return np.array(dataX), np.array(dataY)
# Specify for how many days we want to predict the price by changing the into_the_future parameter.
into_the_future = 10
""""Define the input data for the 10 days LSTM prediction"""
# Each input sample now contains 10 days of history.
window=10
# NOTE(review): this uses create_LSTM_dataset (single-step targets), not the
# create_LSTM10_dataset defined just above — confirm which one is intended.
LSTM10_train_input, LSTM10_train_output = create_LSTM_dataset(LSTM_train_list[0].values, window)
LSTM10_test_input, LSTM10_test_output = create_LSTM_dataset(LSTM_test_list[0].values, window)
'''reshape the input to be [samples, time steps, features]'''
LSTM10_test_input = np.reshape(LSTM10_test_input, (LSTM10_test_input.shape[0], LSTM10_test_input.shape[1], 7))
LSTM10_train_input = np.reshape(LSTM10_train_input, (LSTM10_train_input.shape[0], LSTM10_train_input.shape[1], 7))
# Sanity-check the resulting shapes.
print(LSTM10_train_input.shape)
print(LSTM10_test_input.shape)
print('--------------------')
print(LSTM10_train_output.shape)
print(LSTM10_test_output.shape)
# Random seed for reproducibility
#np.random.seed(17)
np.random.seed(202)
model_10 = LSTM10_model(LSTM10_train_input, output_size = 1, neurons=50)
start = time.time()
trained_LSTM10 = model_10.fit(LSTM10_train_input, LSTM10_train_output,
                              epochs=1, batch_size=2, verbose=1, shuffle=True, validation_split=0.05)
print('Total training time (s): {0:0.0f}'.format(time.time()-start))
#plot_error(trained_LSTM10)
trainScore = model_10.evaluate(LSTM10_train_input, LSTM10_train_output, verbose=0)
testScore = model_10.evaluate(LSTM10_test_input, LSTM10_test_output, verbose=0)
print("Mean Squared Error on the training data: {0:0.6f}".format(trainScore))
print("Mean Squared Error on the test data: {0:0.6f}".format(testScore))
# The prediction output always has 1 column (chosen when the LSTM was designed).
# Make predictions for the train set. Then invert the scaling.
#LSTM10_train_pred = model_10.predict(LSTM10_train_input[:-into_the_future])
#print(LSTM10_train_pred.shape)
#LSTM10_train_pred = Un_scale_data(LSTM10_train_pred, tickr)
#LSTM10_train_output = Un_scale_data(LSTM10_train_output, tickr) ##
Let's focus on what is important and interesting: the model's performance on the unseen test set.
### NOTE: These two functions must be revised — they were copied verbatim.
def plot_long_pred(pred_data, true_data, pred_len, title='', xlabel='', ylabel=''):
    """ Plot the predictions stored in pred_data and the true values stored in true_data.
    pred_len is the length of each prediction. """
    index = true_data.index
    fig = plt.figure(figsize=(16, 7), facecolor='white')
    ax = fig.add_subplot(111)
    # Plot the true series reversed — the data appears to be stored
    # newest-first (see the reversed index formatter below); TODO confirm.
    ax.plot(true_data.values[:, 0][::-1], label='True Data')
    #Pad the list of predictions to shift it in the graph to its correct start
    for i, data in enumerate(pred_data):
        padding = [None for p in range(i * pred_len)]
        #Show the legend only for the first 5 predictions
        # NOTE(review): plt.legend() is re-invoked for each of the first five
        # runs, each labelled 'Prediction' — the legend shows duplicate entries.
        if i < 5:
            plt.plot((padding + data), label='Prediction')
            plt.legend()
        else:
            plt.plot(padding + data)
    ax.set_title(label=title, fontsize=20)
    ax.set_xlabel(xlabel, fontsize=15)
    ax.autoscale(enable=True, axis='x', tight=True)
    ax.set_ylabel(ylabel, fontsize=15)
    ax.grid(axis='both', alpha=.5)
    ax.xaxis.set_major_locator(MaxNLocator(12))
    # NOTE(review): IndexFormatter was removed in matplotlib 3.5 — this code
    # only runs on older matplotlib versions.
    ax.xaxis.set_major_formatter(IndexFormatter(index[::-1]))
    plt.setp(ax.get_xticklabels(), rotation=50, fontsize=12)
    plt.setp(ax.get_yticklabels(), fontsize=12)
    plt.show()
def predict_multiple_sequences(model, data, window_size, pred_len):
    """Generate successive multi-step forecasts over *data*.

    Every pred_len samples a fresh window is taken from *data*; within that
    run the model predicts one step at a time, and each prediction is fed
    back as the newest row of the window while the oldest row is dropped.

    Returns a list of lists — one inner list of pred_len predicted values
    per forecast run.
    """
    all_runs = []
    n_runs = len(data) // pred_len
    for run in range(n_runs):
        frame = data[run * pred_len]
        run_preds = []
        for _ in range(pred_len):
            next_val = model.predict(frame[np.newaxis, :, :])[0, 0]
            run_preds.append(next_val)
            # Slide the window: drop the oldest row, append the prediction
            # (the scalar broadcasts across all feature columns).
            frame = np.insert(frame[1:], [window_size - 1], next_val, axis=0)
        all_runs.append(run_preds)
    return all_runs
tickr = get_ticker(LSTM_train_list[0])
# Forecast in 10-day runs across the (reversed) test set.
LSTM10_predictions = predict_multiple_sequences(model_10, LSTM10_test_input[::-1], window, into_the_future)
# NOTE(review): the predictions are plotted on the scaled values — the
# commented-out code below would invert the scaling first.
#inv_LSTM10_predictions = copy.deepcopy(pd.DataFrame(LSTM10_predictions).transpose())
#for i in range(len(LSTM10_predictions)):
#    Un_scale_data(inv_LSTM10_predictions.iloc[:, i], tickr)
#    #column = Un_scale_data(copy.deepcopy(column.values), tickr)
##    inv_LSTM10_predictions.append(list1.tolist())
plot_long_pred(LSTM10_predictions, LSTM_test_list[0], into_the_future,
               title='10 day predictions on test set, ' + tickr, xlabel='Date', ylabel='Price')
Run the cell below to find the optimal algorithm/tuning parameters for each stock. (The run will take several tens of hours.)
from keras import regularizers
# Define the LSTM model
def LSTM10_model2(inputs, output_size, neurons, activ_func="linear",
                  dropout=0.2, loss="mean_squared_error", optimizer="rmsprop"):
    """Build and compile the two-layer LSTM used by the grid search below.

    Same architecture as LSTM10_model above: LSTM(neurons) -> Dropout ->
    LSTM(2*neurons) -> Dropout -> Dense(output_size) -> Activation, with
    only the shape of *inputs* being read. Prints a layer summary and
    returns the compiled model.
    """
    net = Sequential()
    stack = (
        LSTM(neurons, input_shape=(inputs.shape[1], inputs.shape[2]), return_sequences=True),
        Dropout(dropout),
        LSTM(neurons * 2, return_sequences=False),
        Dropout(dropout),
        Dense(units=output_size),
        Activation(activ_func),
    )
    for layer in stack:
        net.add(layer)
    net.compile(loss=loss, optimizer=optimizer)
    net.summary()
    return net
def many_LSTM10():
    """Search training settings for the 10-day LSTM on stock 0.

    Tries every combination of window in {10, 20}, batch_size in
    {1, 2, 10, 50, 100} and epochs in {1, 2}; for each it trains a fresh
    LSTM10_model2, prints train/test MSE and plots the 10-day forecast
    runs. Results are inspected visually — nothing is returned or saved.
    """
    global_start_time = time.time()
    windows = [10, 20]
    into_the_future = 10
    start = 0  # NOTE(review): unused
    count = 1
    for stock_nbr in range(1):
        for window in windows:
            for batch_size in [1, 2, 10, 50, 100]:
                for epoch in [1, 2]:
                    """"Define the input data for the 10 day LSTM prediction"""
                    LSTM10_train_input, LSTM10_train_output = create_LSTM_dataset(LSTM_train_list[stock_nbr].values, window)
                    LSTM10_test_input, LSTM10_test_output = create_LSTM_dataset(LSTM_test_list[stock_nbr].values, window)
                    '''reshape the input to be [samples, time steps, features]'''
                    LSTM10_test_input = np.reshape(LSTM10_test_input, (LSTM10_test_input.shape[0],
                                                                       LSTM10_test_input.shape[1], 7))
                    LSTM10_train_input = np.reshape(LSTM10_train_input, (LSTM10_train_input.shape[0],
                                                                         LSTM10_train_input.shape[1], 7))
                    print('===============================')
                    print('(Stock, window, batch_size, epoch)')
                    print('{0}, {1}, {2}, {3}'.format(get_ticker(LSTM_train_list[stock_nbr]),
                                                      window, batch_size, epoch))
                    print('Run: {0} ({1})'.format(count, 1*2*len([1, 2, 10, 50, 100])*2))
                    print('===============================')
                    # Random seed for reproducibility
                    np.random.seed(202)
                    model_10 = LSTM10_model2(LSTM10_train_input, output_size = 1, neurons=50)
                    trained_LSTM10 = model_10.fit(LSTM10_train_input, LSTM10_train_output, epochs=epoch,
                                                  batch_size=batch_size, verbose=1,
                                                  shuffle=True, validation_split=0.05)
                    #plot_error(trained_LSTM10)
                    trainScore = model_10.evaluate(LSTM10_train_input, LSTM10_train_output, verbose=0)
                    testScore = model_10.evaluate(LSTM10_test_input, LSTM10_test_output, verbose=0)
                    print("Mean Squared Error on the training data: {0:0.6f}".format(trainScore))
                    print("Mean Squared Error on the test data: {0:0.6f}".format(testScore))
                    tickr = get_ticker(LSTM_train_list[stock_nbr])
                    LSTM10_predictions = predict_multiple_sequences(model_10, LSTM10_test_input[::-1],
                                                                    window, into_the_future)
                    plot_long_pred(LSTM10_predictions, LSTM_test_list[stock_nbr], into_the_future,
                                   title='10 day predictions on test set, ' + tickr, xlabel='Date', ylabel='Price')
                    count += 1
                    print('==================================================================')
                    print('==================================================================')
    print('Total training time (s): {0:0.0f}'.format(time.time()-global_start_time))
#many_LSTM10()
###### from keras import regularizers
# Define the LSTM model
#def LSTM10_model2(inputs, output_size, neurons, activ_func="linear",
# dropout=0.2, loss="mean_squared_error", optimizer="rmsprop"):
# model = Sequential()
# model.add(LSTM(neurons, input_shape=(inputs.shape[1], inputs.shape[2]), return_sequences=True))
# model.add(Dropout(dropout))
# model.add(LSTM(neurons*2, return_sequences=False))
# model.add(Dropout(dropout))
# model.add(Dense(units=output_size))
# model.add(Activation(activ_func))
# model.compile(loss=loss, optimizer=optimizer)
# model.summary()
# return model
#def many_LSTM10():
# global_start_time = time.time()
# windows = [10, 20]
# into_the_future = 10
# start = 0
# count = 1
# stocks_to_change = [5, 12, 30, 49, 51, 61, 54]
# for stock_nbr in stocks_to_change:
# for window in windows:
# for batch_size in [1, 150, 200]:
# for epoch in [5, 10]:
# """"Define the input data for the 10 day LSTM prediction"""
# LSTM10_train_input, LSTM10_train_output = create_LSTM_dataset(LSTM_train_list[stock_nbr].values, window)
# LSTM10_test_input, LSTM10_test_output = create_LSTM_dataset(LSTM_test_list[stock_nbr].values, window)
# '''reshape the input to be [samples, time steps, features]'''
# LSTM10_test_input = np.reshape(LSTM10_test_input, (LSTM10_test_input.shape[0], LSTM10_test_input.shape[1], 7))
# LSTM10_train_input = np.reshape(LSTM10_train_input, (LSTM10_train_input.shape[0], LSTM10_train_input.shape[1], 7))
# print('===============================')
# print('(Stock, window, batch_size, epoch)')
# print('{0}, {1}, {2}, {3}'.format(get_ticker(LSTM_train_list[stock_nbr]), window, batch_size, epoch))
# print('Run: {0} ({1})'.format(count, len(stocks_to_change)*2*len([1, 150, 200])*2))
# print('===============================')
# # Random seed for reproducibility
# np.random.seed(202)
# model_10 = LSTM10_model2(LSTM10_train_input, output_size = 1, neurons=50)
# trained_LSTM10 = model_10.fit(LSTM10_train_input, LSTM10_train_output, epochs=epoch,
# batch_size=batch_size, verbose=1, shuffle=True, validation_split=0.05)
# #plot_error(trained_LSTM10)
# trainScore = model_10.evaluate(LSTM10_train_input, LSTM10_train_output, verbose=0)
# testScore = model_10.evaluate(LSTM10_test_input, LSTM10_test_output, verbose=0)
# print("Mean Squared Error on the training data: {0:0.6f}".format(trainScore))
# print("Mean Squared Error on the test data: {0:0.6f}".format(testScore))
# tickr = get_ticker(LSTM_train_list[stock_nbr])
# LSTM10_predictions = predict_multiple_sequences(model_10, LSTM10_test_input[::-1], window, into_the_future)
# plot_long_pred(LSTM10_predictions, LSTM_test_list[stock_nbr], into_the_future,
# title='10 day predictions on test set, ' + tickr, xlabel='Date', ylabel='Price')
# count += 1
# print('==================================================================')
# print('==================================================================')
# print('Total training time (s): {0:0.0f}'.format(time.time()-global_start_time))
The optimal tuning parameters are stored in the algorithm_tunings dictionary. (Stock: window size, batch size, number of epochs)
#(Stock: Window size, batch size, number of epochs)
# Per-ticker training settings chosen from the (commented-out) grid search
# above; each value is [window, batch_size, epochs] for that stock's 10-day
# LSTM, consumed by make_all_10pred below.
algorithm_tunings = {'ACAN-B.ST': [10, 1, 1], 'ANOD-B.ST': [10, 50, 1], 'ADDT-B.ST':[10, 2, 1],'AOI.ST':[10, 50, 1],
                     'AQ.ST':[10, 50, 1], 'ARCM.ST':[20, 100, 1], 'BEIA-B.ST': [20, 50, 1], 'BEIJ-B.ST': [10, 50, 1],
                     'BIOG-B.ST':[20, 10, 1], 'BIOT.ST':[10, 50, 1], 'PXXS-SDB.ST':[20, 2, 1], 'BULTEN.ST':[20, 50, 1],
                     'BURE.ST':[20, 50, 2], 'BMAX.ST':[10, 100, 2], 'CAT-A.ST':[10, 50, 1], 'CAT-B.ST': [10, 100, 2],
                     'CATE.ST': [10, 100, 2], 'CCC.ST': [10, 10, 1], 'CEVI.ST': [10, 50, 1], 'CLAS-B.ST': [20, 50, 2],
                     'CLA-B.ST': [10, 100, 2], 'COIC.ST': [20, 50, 1], 'CRED-A.ST':[20, 100, 1], 'DIOS.ST':[20, 50, 1],
                     'DUNI.ST':[10, 100, 2], 'ELAN-B.ST':[20, 100, 1], 'ENQ.ST': [10, 10, 1], 'FAG.ST':[20, 100, 1],
                     'FPAR.ST':[20, 50, 1], 'G5EN.ST':[20, 2, 1], 'GUNN.ST':[20, 50, 1], 'HLDX.ST':[10, 10, 1],
                     'HMED.ST':[20, 100, 1], 'HEBA-B.ST':[20, 100, 2], 'HIQ.ST':[20, 100, 2], 'HMS.ST':[10, 50, 1],
                     'IAR-B.ST':[10, 50, 1], 'IVSO.ST':[10, 50, 2], 'KABE-B.ST':[10, 50, 1], 'KAHL.ST':[10, 100, 2],
                     'KARO.ST':[20, 50, 1], 'KNOW.ST':[10, 100, 1], 'LIAB.ST':[10, 100, 2], 'LUC.ST':[20, 10, 1],
                     'MVIR-B.ST':[10, 2, 2], 'MEKO.ST':[10, 100, 2], 'MSON-A.ST':[10, 100, 1],'MSON-B.ST':[10, 100, 2],
                     'MYCR.ST':[10, 100, 2], 'NMAN.ST':[10, 100, 1], 'NETI-B.ST':[20, 50, 2], 'NEWA-B.ST':[10, 100, 2],
                     'NOLA-B.ST':[10, 100, 1], 'OEM-B.ST':[10, 50, 1], 'OPUS.ST':[10, 100, 2], 'ORX.ST':[10, 1, 1],
                     'PROB.ST':[10, 10, 1], 'QLRO.ST':[20, 2, 2], 'RAY-B.ST':[10, 100, 2], 'REZT.ST':[10, 10, 2],
                     'SAS.ST':[10, 100, 2], 'SMF.ST':[10, 1, 2], 'SKIS-B.ST':[10, 50, 1], 'STAR-B.ST':[10, 2, 1],
                     'SWOL-B.ST':[20, 100, 1], 'SYSR.ST':[20, 50, 1], 'TETY.ST':[10, 100, 1], 'TRAC-B.ST':[20, 100, 2],
                     'VBG-B.ST':[20, 2, 1], 'VITR.ST':[10, 50, 1], 'XVIVO.ST':[20, 50, 2], 'ORES.ST':[20, 50, 2]}
def create_pred_path(ticker, base='/Users/jakob/Desktop/Programming/Udacity Machine Learning Nano Degree/Capstone Project/Predictions/'):
    """Return the path of the .csv file holding *ticker*'s predictions.

    ticker: stock ticker symbol, e.g. 'AOI.ST'.
    base:   directory (with trailing separator) where prediction files are
            stored. Defaults to the author's local Predictions folder, but
            is now a parameter so the code can run on other machines.
    """
    return(base + ticker + '_Predictions' + '.csv')
Make predictions for all the stocks using each stock's individually tuned algorithm. Save the predictions both to a dictionary and to .csv files.
def make_all_10pred():
    """Train the tuned 10-day LSTM for every stock and save its predictions.

    For each stock, the (window, batch_size, epochs) triple is looked up in
    algorithm_tunings, a fresh LSTM10_model is trained, 10-day forecast runs
    are generated over the reversed test set, and the result is written both
    to a per-ticker .csv file (see create_pred_path) and into the returned
    dict keyed by ticker.
    """
    all_predictions = {}
    global_start_time = time.time()
    into_the_future = 10
    count = 1
    print('Creating predictions... ')
    print()
    for stock_nbr in range(len(LSTM_train_list)):
        stock_ticker = get_ticker(LSTM_train_list[stock_nbr])
        stock_info = algorithm_tunings[stock_ticker]
        window, batch_size, epoch = stock_info[0], stock_info[1], stock_info[2]
        print('===============================')
        print('(Stock, window, batch_size, epoch)')
        print('{0}, {1}, {2}, {3}'.format(stock_ticker, window, batch_size, epoch))
        print('Count: {0} ({1})'.format(count, len(LSTM_train_list)))
        print('===============================')
        """"Define the input data for the 10 day LSTM prediction"""
        LSTM10_train_input, LSTM10_train_output = create_LSTM_dataset(LSTM_train_list[stock_nbr].values, window)
        LSTM10_test_input, LSTM10_test_output = create_LSTM_dataset(LSTM_test_list[stock_nbr].values, window)
        '''reshape the input to be [samples, time steps, features]'''
        LSTM10_test_input = np.reshape(LSTM10_test_input, (LSTM10_test_input.shape[0], LSTM10_test_input.shape[1], 7))
        LSTM10_train_input = np.reshape(LSTM10_train_input, (LSTM10_train_input.shape[0], LSTM10_train_input.shape[1], 7))
        # Random seed for reproducibility
        np.random.seed(202)
        model_10 = LSTM10_model(LSTM10_train_input, output_size = 1, neurons=50)
        trained_LSTM10 = model_10.fit(LSTM10_train_input, LSTM10_train_output, epochs=epoch,
                                      batch_size=batch_size, verbose=1, shuffle=True, validation_split=0.05)
        #plot_error(trained_LSTM10)
        trainScore = model_10.evaluate(LSTM10_train_input, LSTM10_train_output, verbose=0)
        testScore = model_10.evaluate(LSTM10_test_input, LSTM10_test_output, verbose=0)
        print("Mean Squared Error on the training data: {0:0.6f}".format(trainScore))
        print("Mean Squared Error on the test data: {0:0.6f}".format(testScore))
        #tickr = get_ticker(LSTM_train_list[stock_nbr])
        LSTM10_predictions = predict_multiple_sequences(model_10, LSTM10_test_input[::-1], window, into_the_future)
        # Save the predictions to a .csv file
        pd.DataFrame(LSTM10_predictions).to_csv(create_pred_path(stock_ticker))
        # Save the predictions to an array
        all_predictions[stock_ticker] = LSTM10_predictions
        count +=1
        print('======================================================================================')
        print('======================================================================================')
    print()
    print('...Done!')
    print('Total run time (s): {0:0.0f}'.format(time.time()-global_start_time))
    return all_predictions
# predictions_10 contains all the predictions for each stock. (The run will take roughly an hour.)
#predictions_10 = make_all_10pred()
#print(pd.DataFrame(predictions_10['ACAN-B.ST']))
def get_predictions():
    """Load every per-stock prediction .csv back into a list of DataFrames.

    Each DataFrame's index name is set to its source file's name (the
    ticker-based file stem) so it can later be looked up by ticker.

    NOTE(review): relies on `glob` (from glob import glob) and `os` being
    imported earlier in the file — confirm.
    """
    di = '/Users/jakob/Desktop/Programming/Udacity Machine Learning Nano Degree/Capstone Project/Predictions/'
    filePaths = glob(di+"*.csv") # Get each .csv file in the directory
    # Get all the file names
    file_names_pred = []
    for root, dirs, files in os.walk(di):
        for filename in files:
            filename = filename[:-4] # Just keep the ticker name, without the .csv file extention
            file_names_pred.append(filename)
    # NOTE(review): blindly drops the first entry — presumably a hidden file
    # such as .DS_Store that os.walk lists first; this also assumes os.walk
    # and glob yield files in matching order. Both assumptions are fragile —
    # verify on the target machine.
    del file_names_pred[0]
    # Get the predictions from the .csv files
    predictions = []
    for i in range(len(filePaths)):
        predi = pd.read_csv(filePaths[i])
        # Drop the unnamed column holding the saved row index.
        predi.drop(predi.columns[[0]], axis=1, inplace=True)
        predi.index.names = [file_names_pred[i]]
        predictions.append(predi)
    return predictions
# Reload the saved 10-day predictions from disk into a list of DataFrames.
predictions_10 = get_predictions()
def calc_accuracy(predictions, true_data):
    """Fraction of prediction rows whose direction (up or down over the
    prediction horizon) agrees with the direction of the true data.

    predictions: DataFrame with one prediction sequence per row.
    true_data:   DataFrame whose column 4 holds the actual prices.

    NOTE(review): for row 0 the expression row*columns - columns is
    negative, so iloc wraps around to the end of true_data — confirm this
    is intended (the data elsewhere in this file is stored reversed).
    """
    correct = 0
    wrong = 0
    n_rows, horizon = predictions.shape[0], predictions.shape[1]
    for r in range(n_rows):
        first = predictions.iloc[r, 0]
        last = predictions.iloc[r, -1]
        start_true = true_data.iloc[r * horizon - horizon, 4]
        end_true = true_data.iloc[r * horizon, 4]
        if first < last and start_true < end_true:
            correct += 1      # predicted rise, true rise
        elif first > last and start_true > end_true:
            correct += 1      # predicted fall, true fall
        else:
            wrong += 1        # direction missed (or flat)
    return (correct / n_rows)
def get_top_predictions(top_x, pred_list, true_data_list):
    """top_x is the amount of top predictions desired to be returned by the function.
    pred_list is a list containing all the predictions for all the stocks.
    true_data_list is a list containing all the true data, stored in df

    Keeps only the stocks whose latest prediction row forecasts a rise that
    the true data confirms, ranks them by predicted relative gain and
    returns the top_x (ticker, gain) pairs, best first.
    """
    cor_inc_pred = {}
    for pred in pred_list:
        row, col = pred.shape[0], pred.shape[1]
        # NOTE(review): [:-6] trims 6 characters from the stored index name,
        # but the prediction files are saved with a '_Predictions' suffix
        # (12 characters) — confirm this really yields the bare ticker.
        tkr = get_ticker(pred)[:-6]
        true_data = get_stock(true_data_list, tkr)[::-1]
        # Keep the stock only if the last prediction row says "up" and the
        # true data agrees over the same span. NOTE(review): iloc[row*col]
        # raises IndexError when row*col >= len(true_data) — verify lengths.
        if (pred.iloc[-1, -1] > pred.iloc[-1, 0]) and (true_data.iloc[row*col, 4] > true_data.iloc[row*col-col, 4]):
            cor_inc_pred[tkr] = ((pred.iloc[-1, -1] - pred.iloc[-1, 0]) / pred.iloc[-1, 0])
    # Rank by predicted relative gain, largest first.
    sorted_cor_inc_pred = sorted(cor_inc_pred.items(), key=operator.itemgetter(1), reverse=True)
    if len(sorted_cor_inc_pred) < top_x:
        print('Fewer correct top predictions were found than asked for. Returning all that were found.\n')
    return sorted_cor_inc_pred[:top_x]
def plot_all_10pred(predictions):
    """Plot all the predicted values.

    For each stock: plot its 10-day prediction runs against the true test
    data, print its directional accuracy (calc_accuracy) and count how many
    stocks reach at least 60% accuracy.
    """
    into_the_future, good_acc = 10, 0
    for predi in predictions:
        # NOTE(review): [:-6] suffix trimming — same concern as in
        # get_top_predictions; confirm it yields the bare ticker.
        tkr = get_ticker(predi)[:-6]
        true_values = get_stock(LSTM_test_list, tkr)
        plot_long_pred(predi.values.tolist(), true_values, into_the_future,
                       title='10 day predictions on test set, ' + tkr, xlabel='Date', ylabel='Price')
        accuracy = calc_accuracy(predi, true_values)
        if accuracy >= 0.6:
            good_acc += 1
        print('Accuracy score for {0}: {1:0.2f}%'.format(tkr, accuracy*100))
        print('==================================================================================================')
        print('==================================================================================================')
    print()
    print('Total number of satisfying predictions (over 60%): {0} out of {1}'.format(good_acc, len(predictions)))
# Uncomment to plot every stock's 10-day predictions (slow — many figures).
#plot_all_10pred(predictions_10)
# Report the five most promising, correctly predicted risers.
top_pred = get_top_predictions(5, predictions_10, LSTM_test_list)
print(top_pred)